HEAD <<<<<<< HEAD
title: “Lab 5” author: “R4L” date: “2/13/2019” output: html_document —
team_milk <- milk %>%
filter(state %in% c("Texas", "Colorado", "California", "Tennessee", "Illinois"))
milk1996 <- milk %>%
filter(year == 1996)
ggplot(data= milk1996, aes(milk_million, fill = region), position = "fill")+
geom_histogram(bins = 15)+
ggtitle('Histogram of milk produced in 1996 by state')
milk1996 %>%
summarise(avg_milk_produced = mean(milk_million), median_milk_produced = median(milk_million))%>%
arrange(desc(avg_milk_produced))
## avg_milk_produced median_milk_produced
## 1 3080.12 1480
team_milk$average <- national$milk
ggplot(data = team_milk, aes(x = year, y = milk_million, color =state))+
geom_point()+
geom_smooth(aes(x = year, y = average), se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
The year when the most milk produced was in 2017 with production 4309.32 million lb.
arrange(national, milk) %>%
top_n(1)
## Selecting by milk
## # A tibble: 1 x 2
## year milk
## <dbl> <dbl>
## 1 2017 4309.
The year when the least milk produced was in 1975 with production 2307.96 million lb.
arrange(milk1996, desc(milk_million))%>%
top_n(1)
## Selecting by milk_million
## region state year milk_produced milk_million
## 1 Pacific California 1996 2.5848e+10 25848
descmilkyen <- milk1996 %>%
mutate(milkrank = -milk_million) %>%
arrange(-milk_million)
top_n(descmilkyen,1)%>%
select(region, state, year, milk_produced, milk_million)
## Selecting by milkrank
## region state year milk_produced milk_million
## 1 Pacific Alaska 1996 1.4e+07 14
milk1998 <- milk %>%
filter(year == 1998)
ggplot(data = milk1998, aes(x = as.factor(year), y = milk_million, fill = region)) +
geom_boxplot() +
ggtitle('Pounds of Milk Produced in 1998 by Region') +
scale_fill_discrete(name = 'Region') +
xlab('Year') +
ylab('Milk Produced (Millions lb)')
descnational <- national %>%
mutate(avg = -milk) %>%
arrange(-milk)
top_n(descnational,1)%>%
select(year, milk)
## Selecting by avg
## # A tibble: 1 x 2
## year milk
## <dbl> <dbl>
## 1 1975 2308.
milk1998 %>%
filter(year == 1998) %>%
summarise(avg_milk_produced = mean(milk_million),
median_milk_produced = median(milk_million))%>%
arrange(desc(avg_milk_produced))
## avg_milk_produced median_milk_produced
## 1 3145.22 1411.5
arrange(milk1998, desc(milk_million))%>%
top_n(1)
## Selecting by milk_million
## region state year milk_produced milk_million
## 1 Pacific California 1998 2.762e+10 27620
leastmilk <- milk1998 %>%
mutate(milkrank = -milk_million) %>%
arrange(-milk_million)
top_n(leastmilk,1)%>%
select(region, state, year, milk_produced, milk_million)
## Selecting by milkrank
## region state year milk_produced milk_million
## 1 Pacific Alaska 1998 1.4e+07 14
The 5 states that produced the most milk in 2017 was California with production 397,798 million lb.
filter(team_milk, year == 2017) %>%
arrange(desc(milk_million))
## region state year milk_produced milk_million average
## 1 Pacific California 2017 3.9798e+10 39798 4309.32
## 2 Southern Plains Texas 2017 1.2054e+10 12054 4171.94
## 3 Mountain Colorado 2017 4.1890e+09 4189 4248.12
## 4 Corn Belt Illinois 2017 1.9290e+09 1929 4024.60
## 5 Appalachian Tennessee 2017 6.9300e+08 693 4121.12
The 5 states that produced the least milk in 2017 was Tennessee with production 693 million lb.
filter(team_milk, year == 2017) %>%
arrange(milk_million)
## region state year milk_produced milk_million average
## 1 Appalachian Tennessee 2017 6.9300e+08 693 4121.12
## 2 Corn Belt Illinois 2017 1.9290e+09 1929 4024.60
## 3 Mountain Colorado 2017 4.1890e+09 4189 4248.12
## 4 Southern Plains Texas 2017 1.2054e+10 12054 4171.94
## 5 Pacific California 2017 3.9798e+10 39798 4309.32
milk1998<- milk%>%
filter(year==1998)
ggplot(data = milk1998)+
geom_point(aes(x=state, y=milk_million,color=region))+
theme(legend.position = "bottom")+
theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5))
ggtitle('Milk Produced in 1998')
## $title
## [1] "Milk Produced in 1998"
##
## attr(,"class")
## [1] "labels"
Average Milk Produced in 1998
milk1998%>%
summarise(Average_Milk_Produced_1998=mean(milk_million))
## Average_Milk_Produced_1998
## 1 3145.22
Median of Milk Produced in 1998
milk1998%>%
summarise(Median_of_Milk_Produced_1998=median(milk_million))
## Median_of_Milk_Produced_1998
## 1 1411.5
ggplot(data= milk1996, aes(milk_million, fill = region), position = "fill")+
geom_histogram(bins = 15)+
ggtitle('Histogram of milk produced in 1996 by state')+
xlab('Milk Produced (Millions lb)')
#### Report
The average milk produced in 1996 was 3080.12 million lb, and the median was 1480 million lb.
milk1996 %>%
summarise(avg_milk_produced = mean(milk_million), median_milk_produced = median(milk_million))%>%
arrange(desc(avg_milk_produced))
## avg_milk_produced median_milk_produced
## 1 3080.12 1480
The state that produced the most milk in 1996 was California, which they produced 25,848 million lb.
arrange(milk1996, desc(milk_million))%>%
top_n(1)
## Selecting by milk_million
## region state year milk_produced milk_million
## 1 Pacific California 1996 2.5848e+10 25848
The state that produced the least milk in 1996 was Alaska, which they produces 14 million lb.
descmilkyen <- milk1996 %>%
mutate(milkrank = -milk_million) %>%
arrange(-milk_million)
milk1998 <- milk %>%
filter(year == 1998)
ggplot(data = milk1998, aes(x = as.factor(year), y = milk_million, fill = region)) + geom_boxplot() +
ggtitle('Pounds of Milk Produced in 1998 by Region') +
scale_fill_discrete(name = 'Region') +
xlab('Year') +
ylab('Milk Produced (Millions lb)')
milk1998 %>%
filter(year == 1998) %>%
summarise(avg_milk_produced = mean(milk_million),
median_milk_produced = median(milk_million))%>%
arrange(desc(avg_milk_produced))
## avg_milk_produced median_milk_produced
## 1 3145.22 1411.5
arrange(milk1998, desc(milk_million))%>%
top_n(1)
## Selecting by milk_million
## region state year milk_produced milk_million
## 1 Pacific California 1998 2.762e+10 27620
leastmilk <- milk1998 %>%
mutate(milkrank = -milk_million) %>%
arrange(-milk_million)
top_n(leastmilk,1)%>%
select(region, state, year, milk_produced, milk_million)
## Selecting by milkrank
## region state year milk_produced milk_million
## 1 Pacific Alaska 1998 1.4e+07 14
milk1998 <- milk %>%
filter(year == 1998)
ggplot(data= milk1998, aes(milk_million, fill = region), position = "fill")+
geom_histogram(bins = 10)+
ggtitle('Histogram of milk produced in 1998 by state')
milk1998 %>%
summarise(avg_milk_produced = mean(milk_million), median_milk_produced = median(milk_million))%>%
arrange(desc(avg_milk_produced))
## avg_milk_produced median_milk_produced
## 1 3145.22 1411.5
arrange(milk1998, desc(milk_million)) %>%
top_n(1)
## Selecting by milk_million
## region state year milk_produced milk_million
## 1 Pacific California 1998 2.762e+10 27620
milk1998<- milk%>%
filter(year==1998)
ggplot(data = milk1998)+
geom_point(aes(x=state, y=milk_million,color=region))+
theme(legend.position = "bottom")+
theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5))+
ggtitle('Milk Produced in 1998')+
xlab('States')+
ylab('Milk in Million Gallons')
Average Milk Produced in 1998
milk1998%>%
summarise(Average_Milk_Produced_1998=mean(milk_million))
## Average_Milk_Produced_1998
## 1 3145.22
Median of Milk Produced in 1998
milk1998%>%
summarise(Median_of_Milk_Produced_1998=median(milk_million))
## Median_of_Milk_Produced_1998
## 1 1411.5
Most Milk Produced State in 1998
milk1998%>%
arrange(desc(milk_million))%>%
slice(1)
## region state year milk_produced milk_million
## 1 Pacific California 1998 2.762e+10 27620
Lest Milk Produced State in 1998
milk1998%>%
arrange(milk_million)%>%
slice(1)
## region state year milk_produced milk_million
## 1 Pacific Alaska 1998 1.4e+07 14
ggplot(data = team_milk, aes(x = year, y = milk_million, color =state))+
geom_point()+
geom_smooth(aes(x = year, y = average), se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
The year when the most milk produced was in 2017 with production 4309.32 million lb.
arrange(national, milk) %>%
top_n(1)
## Selecting by milk
## # A tibble: 1 x 2
## year milk
## <dbl> <dbl>
## 1 2017 4309.
The year when the least milk produced was in 1975 with production 2307.96 million lb.
descnational <- national %>%
mutate(avg = -milk) %>%
arrange(-milk)
top_n(descnational,1)%>%
select(year, milk)
## Selecting by avg
## # A tibble: 1 x 2
## year milk
## <dbl> <dbl>
## 1 1975 2308.
The 5 states that produced the most milk in 2017 was California with production 397,798 million lb.
filter(team_milk, year == 2017) %>%
arrange(desc(milk_million))
## region state year milk_produced milk_million average
## 1 Pacific California 2017 3.9798e+10 39798 4309.32
## 2 Southern Plains Texas 2017 1.2054e+10 12054 4171.94
## 3 Mountain Colorado 2017 4.1890e+09 4189 4248.12
## 4 Corn Belt Illinois 2017 1.9290e+09 1929 4024.60
## 5 Appalachian Tennessee 2017 6.9300e+08 693 4121.12
The 5 states that produced the least milk in 2017 was Tennessee with production 693 million lb.
filter(team_milk, year == 2017) %>%
arrange(milk_million)
## region state year milk_produced milk_million average
## 1 Appalachian Tennessee 2017 6.9300e+08 693 4121.12
## 2 Corn Belt Illinois 2017 1.9290e+09 1929 4024.60
## 3 Mountain Colorado 2017 4.1890e+09 4189 4248.12
## 4 Southern Plains Texas 2017 1.2054e+10 12054 4171.94
## 5 Pacific California 2017 3.9798e+10 39798 4309.32
In this project, our team learns how to use the data transformation to process data, for helping us to read the data much easier. By using data transformation, for example, filter() function helping us to ignore the unwanted data, and we can make a more accurate plot. Furthermore, we use summarise() function to calculate the average milk produced and median milk produced, and we find there is a huge difference between average and median. Therefore, analyze data by using data transformation is the most helpful things we learn by this work.
milk1996 <- milk %>%
filter(year == 1996)
ggplot(data= milk1996, aes(milk_million, fill = region), position = "fill")+
geom_histogram(bins = 15)+
ggtitle('Histogram of milk produced in 1996 by state')+
xlab('Milk Produced (Millions lb)')
#### Report
The average milk produced in 1996 was 3080.12 million lb, and the median was 1480 million lb.
milk1996 %>%
summarise(avg_milk_produced = mean(milk_million), median_milk_produced = median(milk_million))%>%
arrange(desc(avg_milk_produced))
## avg_milk_produced median_milk_produced
## 1 3080.12 1480
The state that produced the most milk in 1996 was California, which they produced 25,848 million lb.
arrange(milk1996, desc(milk_million))%>%
top_n(1)
## Selecting by milk_million
## region state year milk_produced milk_million
## 1 Pacific California 1996 2.5848e+10 25848
The state that produced the least milk in 1996 was Alaska, which they produces 14 million lb.
# descmilkyen <- milk1996 %>%
# mutate(milkrank = -milk_million) %>%
# arrange(-milk_million)
arrange(milk1996, milk_million)%>%
slice(1)
## region state year milk_produced milk_million
## 1 Pacific Alaska 1996 1.4e+07 14
milk1998 <- milk %>%
filter(year == 1998)
ggplot(data = milk1998, aes(x = as.factor(year), y = milk_million, fill = region)) + geom_boxplot() +
ggtitle('Pounds of Milk Produced in 1998 by Region') +
scale_fill_discrete(name = 'Region') +
xlab('Year') +
ylab('Milk Produced (Millions lb)')
The average milk produced in 1998 was 3145.22 million lb, and the median was 1411.5 million lb.
milk1998 %>%
filter(year == 1998) %>%
summarise(avg_milk_produced = mean(milk_million),
median_milk_produced = median(milk_million))%>%
arrange(desc(avg_milk_produced))
## avg_milk_produced median_milk_produced
## 1 3145.22 1411.5
The state that produced the most milk in 1998 was California, which they produced 27,620 million lb.
arrange(milk1998, desc(milk_million))%>%
top_n(1)
## Selecting by milk_million
## region state year milk_produced milk_million
## 1 Pacific California 1998 2.762e+10 27620
The state that produced the least milk in 1998 was Alaska, which they produces 14 million lb.
leastmilk <- milk1998 %>%
mutate(milkrank = -milk_million) %>%
arrange(-milk_million)
top_n(leastmilk,1)%>%
select(region, state, year, milk_produced, milk_million)
## Selecting by milkrank
## region state year milk_produced milk_million
## 1 Pacific Alaska 1998 1.4e+07 14
milk1998 <- milk %>%
filter(year == 1998)
ggplot(data= milk1998, aes(milk_million, fill = region), position = "fill")+
geom_histogram(bins = 10)+
ggtitle('Histogram of milk produced in 1998 by state')
milk1998 %>%
summarise(avg_milk_produced = mean(milk_million), median_milk_produced = median(milk_million))%>%
arrange(desc(avg_milk_produced))
## avg_milk_produced median_milk_produced
## 1 3145.22 1411.5
arrange(milk1998, desc(milk_million)) %>%
top_n(1)
## Selecting by milk_million
## region state year milk_produced milk_million
## 1 Pacific California 1998 2.762e+10 27620
arrange(milk1998, milk_million) %>%
slice(1)
## region state year milk_produced milk_million
## 1 Pacific Alaska 1998 1.4e+07 14
milk1998<- milk%>%
filter(year==1998)
ggplot(data = milk1998)+
geom_point(aes(x=state, y=milk_million,color=region))+
theme(legend.position = "bottom")+
theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5))+
ggtitle('Milk Produced in 1998')
milk1998%>%
summarise(Average_Milk_Produced_1998=mean(milk_million))
## Average_Milk_Produced_1998
## 1 3145.22
milk1998%>%
summarise(Median_of_Milk_Produced_1998=median(milk_million))
## Median_of_Milk_Produced_1998
## 1 1411.5
milk1998%>%
arrange(desc(milk_million))%>%
slice(1)
## region state year milk_produced milk_million
## 1 Pacific California 1998 2.762e+10 27620
milk1998%>%
arrange(milk_million)%>%
slice(1)
## region state year milk_produced milk_million
## 1 Pacific Alaska 1998 1.4e+07 14
library(tidyverse)
milk <- read.csv('state_milk_production.csv')
head(milk)
## region state year milk_produced
## 1 Northeast Maine 1970 6.19e+08
## 2 Northeast New Hampshire 1970 3.56e+08
## 3 Northeast Vermont 1970 1.97e+09
## 4 Northeast Massachusetts 1970 6.58e+08
## 5 Northeast Rhode Island 1970 7.50e+07
## 6 Northeast Connecticut 1970 6.61e+08
milk <- milk %>%
mutate(milk_million = milk_produced/1000000)
I make a histogram to show each states’s milk production in 1994. Then, the average milk produced by each state is 3072.04 million of pounds, the median milk produced by each states is 1513 million of pounds. Finally, California produced the most milk which is 25234 million of pounds, and Alaska produced the least amount of milk which is 13.
milk1994 <- milk %>%
filter(year == 1994)
ggplot(data=milk1994)+
geom_histogram(mapping = aes(x=state, y = milk_million), fill = "blue", stat = "identity")+
theme(axis.text.x=element_text(angle=90,
vjust = 0))+
ggtitle("The USA Milk Production in 1994")+
ylab("Milk Production")+
xlab("State")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
summarise(milk1994, Average_Milk_Produced = mean(milk_million, na.rm = TRUE), Median_Milk_Produced = median(milk_million, na.rm = TRUE))
## Average_Milk_Produced Median_Milk_Produced
## 1 3072.04 1513
arrange(milk1994, desc(milk_million))%>%
summarize(Most_Milk=first(milk_million), Most_Milk_state=first(state), Least_Milk=last(milk_million), Least_Milk_State=last(state))
## Most_Milk Most_Milk_state Least_Milk Least_Milk_State
## 1 25234 California 13 Alaska
title: “Lab 5” author: “R4L” date: “2/13/2019” output: html_document —
ggplot(data = team_milk, aes(x = year, y = milk_million, color =state))+
geom_point()+
geom_smooth(aes(x = year, y = average), se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
The year when the most milk produced was in 2017 with production 4309.32 million lb.
arrange(national, milk) %>%
top_n(1)
## Selecting by milk
## # A tibble: 1 x 2
## year milk
## <dbl> <dbl>
## 1 2017 4309.
The year when the least milk produced was in 1975 with production 2307.96 million lb.
descnational <- national %>%
mutate(avg = -milk) %>%
arrange(-milk)
top_n(descnational,1)%>%
select(year, milk)
## Selecting by avg
## # A tibble: 1 x 2
## year milk
## <dbl> <dbl>
## 1 1975 2308.
The 5 states that produced the most milk in 2017 was California with production 397,798 million lb.
filter(team_milk, year == 2017) %>%
arrange(desc(milk_million))
## region state year milk_produced milk_million average
## 1 Pacific California 2017 3.9798e+10 39798 4309.32
## 2 Southern Plains Texas 2017 1.2054e+10 12054 4171.94
## 3 Mountain Colorado 2017 4.1890e+09 4189 4248.12
## 4 Corn Belt Illinois 2017 1.9290e+09 1929 4024.60
## 5 Appalachian Tennessee 2017 6.9300e+08 693 4121.12
The 5 states that produced the least milk in 2017 was Tennessee with production 693 million lb.
filter(team_milk, year == 2017) %>%
arrange(milk_million)
## region state year milk_produced milk_million average
## 1 Appalachian Tennessee 2017 6.9300e+08 693 4121.12
## 2 Corn Belt Illinois 2017 1.9290e+09 1929 4024.60
## 3 Mountain Colorado 2017 4.1890e+09 4189 4248.12
## 4 Southern Plains Texas 2017 1.2054e+10 12054 4171.94
## 5 Pacific California 2017 3.9798e+10 39798 4309.32
milk1996 <- milk %>%
filter(year == 1996)
ggplot(data= milk1996, aes(milk_million, fill = region), position = "fill")+
geom_histogram(bins = 15)+
ggtitle('Histogram of milk produced in 1996 by state')+
xlab('Milk Produced (Millions lb)')
#### Report
The average milk produced in 1996 was 3080.12 million lb, and the median was 1480 million lb.
milk1996 %>%
summarise(avg_milk_produced = mean(milk_million), median_milk_produced = median(milk_million))%>%
arrange(desc(avg_milk_produced))
## avg_milk_produced median_milk_produced
## 1 3080.12 1480
The state that produced the most milk in 1996 was California, which they produced 25,848 million lb.
arrange(milk1996, desc(milk_million))%>%
top_n(1)
## Selecting by milk_million
## region state year milk_produced milk_million
## 1 Pacific California 1996 2.5848e+10 25848
The state that produced the least milk in 1996 was Alaska, which they produces 14 million lb.
descmilkyen <- milk1996 %>%
mutate(milkrank = -milk_million) %>%
arrange(-milk_million)
milk1998 <- milk %>%
filter(year == 1998)
ggplot(data = milk1998, aes(x = as.factor(year), y = milk_million, fill = region)) + geom_boxplot() +
ggtitle('Pounds of Milk Produced in 1998 by Region') +
scale_fill_discrete(name = 'Region') +
xlab('Year') +
ylab('Milk Produced (Millions lb)')
milk1998 %>%
filter(year == 1998) %>%
summarise(avg_milk_produced = mean(milk_million),
median_milk_produced = median(milk_million))%>%
arrange(desc(avg_milk_produced))
## avg_milk_produced median_milk_produced
## 1 3145.22 1411.5
arrange(milk1998, desc(milk_million))%>%
top_n(1)
## Selecting by milk_million
## region state year milk_produced milk_million
## 1 Pacific California 1998 2.762e+10 27620
leastmilk <- milk1998 %>%
mutate(milkrank = -milk_million) %>%
arrange(-milk_million)
top_n(leastmilk,1)%>%
select(region, state, year, milk_produced, milk_million)
## Selecting by milkrank
## region state year milk_produced milk_million
## 1 Pacific Alaska 1998 1.4e+07 14
milk1998 <- milk %>%
filter(year == 1998)
ggplot(data= milk1998, aes(milk_million, fill = region), position = "fill")+
geom_histogram(bins = 10)+
ggtitle('Histogram of milk produced in 1998 by state')
milk1998 %>%
summarise(avg_milk_produced = mean(milk_million), median_milk_produced = median(milk_million))%>%
arrange(desc(avg_milk_produced))
## avg_milk_produced median_milk_produced
## 1 3145.22 1411.5
arrange(milk1998, desc(milk_million)) %>%
top_n(1)
## Selecting by milk_million
## region state year milk_produced milk_million
## 1 Pacific California 1998 2.762e+10 27620
milk1998<- milk%>%
filter(year==1998)
ggplot(data = milk1998)+
geom_point(aes(x=state, y=milk_million,color=region))+
theme(legend.position = "bottom")+
theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5))+
ggtitle('Milk Produced in 1998')
milk1998%>%
summarise(Average_Milk_Produced_1998=mean(milk_million))
## Average_Milk_Produced_1998
## 1 3145.22
milk1998%>%
summarise(Median_of_Milk_Produced_1998=median(milk_million))
## Median_of_Milk_Produced_1998
## 1 1411.5
milk1998%>%
arrange(desc(milk_million))%>%
slice(1)
## region state year milk_produced milk_million
## 1 Pacific California 1998 2.762e+10 27620
milk1998%>%
arrange(milk_million)%>%
slice(1)
## region state year milk_produced milk_million
## 1 Pacific Alaska 1998 1.4e+07 14